In [212]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import xlrd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.decomposition import PCA
from sklearn import decomposition
In [213]:
# Load the raw FBI "offenses known to law enforcement by city" table (New York, 2013).
# ',' is declared as the thousands separator and '.' as the decimal mark.
fbidata = pd.read_csv(
    'https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv',
    delimiter=",",
    thousands=',',
    decimal=".",
)
# Raw frame that the cleaning cells below start from
fbiraw = pd.DataFrame(fbidata)
fbiraw.head()
Out[213]:
In [214]:
# Transform the FBI raw data
# Row 3 of the raw table holds the real column titles: use it to rename the columns
header_row = fbiraw.iloc[3]
fbiraw_t1 = fbiraw.rename(columns=header_row)
# Drop the first four rows (positions 0-3, i.e. up to and including the header row);
# they do not contain data for the regression model
fbiraw_t2 = fbiraw_t1.drop(index=fbiraw_t1.index[:4])
In [215]:
# Remove the "Rape (revised definition)1" column: it contains no data
fbiraw_t2 = fbiraw_t2.drop(columns=['Rape\n(revised\ndefinition)1'])
In [216]:
# Remove the Arson column: there is insufficient data.
# Source note: 'The FBI does not publish arson data unless it receives data from
# either the agency or the state for all 12 months of the calendar year.'
fbiraw_t2 = fbiraw_t2.drop(columns=['Arson3'])
In [217]:
# Clean the tail of the data set: the last 3 rows do not contain
# information that is relevant for the model
fbiraw_t2 = fbiraw_t2.iloc[:-3]
# Replace the multi-line source headers with short column names
column_names = {
    'Violent\ncrime': 'Violent Crime',
    'Murder and\nnonnegligent\nmanslaughter': 'Murder',
    'Rape\n(legacy\ndefinition)2': 'Rape',
    'Robbery': 'Robbery',
    'Aggravated\nassault': 'Assault',
    'Property\ncrime': 'PropertyCrime',
    'Burglary': 'Burglary',
    'Larceny-\ntheft': 'Larceny & Theft',
    'Motor\nvehicle\ntheft': 'MotorVehicleTheft',
}
fbiraw_t2 = fbiraw_t2.rename(columns=column_names)
In [218]:
# Inspect missing information: info() lists each column's dtype and non-null count
fbiraw_t2.info()
In [219]:
# Convert every numeric column from object (string) to float.
# The source figures use ',' as the thousands separator and '.' as the decimal
# mark, so the commas are stripped explicitly. Parsing through the machine's
# default locale (locale.atof after setlocale('')) would read "1,234" as 1.234
# on any system whose locale uses a decimal comma.
numeric_columns = [
    'Population',
    'Violent Crime',
    'Murder',
    'Rape',
    'Robbery',
    'Assault',
    'PropertyCrime',
    'Burglary',
    'Larceny & Theft',
    'MotorVehicleTheft',
]
for column in numeric_columns:
    # astype(str) lets already-numeric or missing cells go through the same path
    # ('nan' is parsed back to NaN by to_numeric)
    without_separators = fbiraw_t2[column].astype(str).str.replace(',', '', regex=False)
    fbiraw_t2[column] = pd.to_numeric(without_separators).astype(float)
# Confirm the new dtypes
fbiraw_t2.info()
In [220]:
# Re-index the dataframe so that the row labels start at 0 again
fbiraw_t3 = fbiraw_t2.reset_index(drop=True)
# Show the re-indexed frame (the frame built on the line above, not its source)
fbiraw_t3.head()
Out[220]:
In [221]:
# Working frame for the modelling steps. All columns are kept.
# reset_index returns a new, 0-based frame, so the in-place edits made to
# fbiraw_t3 further down do not silently change fbiraw_t2 as well, and the
# row labels line up with frames rebuilt from plain arrays later on.
fbiraw_t3 = fbiraw_t2.reset_index(drop=True)
In [222]:
# Outlier removal is currently disabled; the filter that was tried is kept for reference:
#fbiraw_t3 = fbiraw_t3[fbiraw_t3.PropertyCrime < 450].reset_index(drop=True)
# Summary statistics of the working frame
fbiraw_t3.describe()
Out[222]:
In [223]:
# Print how many rows are kept and how many were excluded relative to fbiraw_t2,
# then list the rows with the highest PropertyCrime values
rows_kept = len(fbiraw_t3)
rows_excluded = len(fbiraw_t2) - rows_kept
print(rows_kept, rows_excluded)
fbiraw_t3.sort_values('PropertyCrime', ascending=False).head()
Out[223]:
In [224]:
# Turn Robbery into an indicator: every positive count becomes 1,
# all other values are left untouched
has_robbery = fbiraw_t3['Robbery'] > 0
fbiraw_t3.loc[has_robbery, 'Robbery'] = 1
In [225]:
# Turn Murder into an indicator: every positive count becomes 1,
# all other values are left untouched
has_murder = fbiraw_t3['Murder'] > 0
fbiraw_t3.loc[has_murder, 'Murder'] = 1
In [226]:
# Final modelling frame: everything except the City label column
fbidata = fbiraw_t3.drop(columns='City')
In [227]:
# Create new features (numbered as additional columns of the frame)
population = fbidata['Population']
violent = fbidata['Violent Crime']
prop_crime = fbidata['PropertyCrime']
larceny = fbidata['Larceny & Theft']
vehicle_theft = fbidata['MotorVehicleTheft']

# 10: natural log of the population
fbidata['logpopulation'] = np.log(population)
# 11: square root of (violent crime x property crime + 1)
fbidata['TotalCrime'] = np.sqrt(violent * prop_crime + 1)
# 12: property crime multiplied by (assault + 1)
# NOTE(review): the name says "per" but this is a product, not a ratio - confirm intent
fbidata['PropertyCrimeperAssault'] = prop_crime * (fbidata['Assault'] + 1)
# 13: square root of (larceny & theft x motor vehicle theft)
fbidata['Theft'] = np.sqrt(larceny * vehicle_theft)
# 14: larceny & theft per 100 inhabitants
fbidata['Unit Larceny & Theft'] = (larceny / population) * 100
# 15: natural log of (violent crime + 1)
fbidata['logviolentcrimes'] = np.log(violent + 1)
In [228]:
# Standardise every column of the modelling frame and keep the column labels
names = fbidata.columns
scaled_values = preprocessing.scale(fbidata)
fbidata_scaled = pd.DataFrame(scaled_values, columns=names)
In [229]:
# Heatmap of the pairwise correlations between the scaled columns
scaled_corr = fbidata_scaled.corr()
sns.heatmap(scaled_corr)
plt.show()
In [230]:
# Outcome variable: the Murder column of the unscaled frame
y = fbidata['Murder']
# Predictor sets tried in earlier iterations, kept for reference:
#Iteration1: X = fbidata_scaled[['Population', 'Violent Crime','Rape','Robbery','Assault', 'PropertyCrime','Burglary','Larceny & Theft','MotorVehicleTheft','logpopulation','TotalCrime','PropertyCrimeperAssault','Theft','Unit Larceny & Theft','logviolentcrimes']]
#Iteration 2: X = fbidata_scaled[['Rape','Robbery','Burglary','logpopulation','PropertyCrimeperAssault']]
#Iteration 3: X = fbidata_scaled[['Rape','Burglary','logpopulation']]
# Iteration 4 (current): predictors taken from the scaled frame
names = ['Rape','Robbery','Burglary','logpopulation']
X = fbidata_scaled[names]
In [231]:
# Cross-validation generator shared by all models below: 10 splits
kf = KFold(n_splits=10)
In [232]:
# Logistic regression classifier with C=1
lr = LogisticRegression(C=1)
# Fit on the full data set and report the learned parameters
fit = lr.fit(X, y)
print(fit.coef_)
print(fit.intercept_)
# Mean score over the cross-validation folds
fold_scores = cross_val_score(lr, X, y, cv=kf)
fold_scores.mean()
Out[232]:
In [233]:
# Build the correlation matrix of the selected predictors
Z = X
correlation_matrix = X.corr()
In [234]:
# Eigenvectors & eigenvalues of the correlation matrix.
# np.linalg.eig does not return the eigenvalues in any particular order, so the
# pairs are sorted from largest to smallest eigenvalue before they are reported.
eig_vals, eig_vecs = np.linalg.eig(correlation_matrix)
descending = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[descending]
eig_vecs = eig_vecs[:, descending]

# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_vals)):
    # Column i of eig_vecs is the eigenvector that belongs to eig_vals[i]
    eigvecs = eig_vecs[:, i].reshape(1, len(X.columns)).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvecs))
    print('Eigenvalue {}: {}'.format(i + 1, eig_vals[i]))
    print(40 * '-')

# Fit PCA on the observations themselves (rows = samples, columns = predictors).
# Fitting it on the correlation matrix would treat the matrix rows as samples and
# report variance ratios that do not describe the dataset.
sklearn_pca = PCA(n_components=len(X.columns))
Y_sklearn = sklearn_pca.fit_transform(X)
print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)
In [235]:
# Scree plot: eigenvalues from largest to smallest.
# They are sorted here because np.linalg.eig gives no ordering guarantee, and a
# scree plot is only readable when the components are in descending order.
plt.plot(np.sort(eig_vals)[::-1])
plt.title('Scree plot')
plt.xlabel('Component index')
plt.ylabel('Eigenvalue')
plt.show()
In [249]:
# Standardise the (already scaled) feature frame with a StandardScaler
# NOTE(review): fbidata_scaled still contains the 'Murder' outcome column - confirm
# that it is meant to take part in the decomposition
sc = StandardScaler()
sc.fit(fbidata_scaled)
X_std = sc.transform(fbidata_scaled)
# PCA object that keeps 4 components
pca = decomposition.PCA(n_components=4)
# Fit the PCA and project the standardised data onto the components
X_std_pca = pca.fit_transform(X_std)
# Shape of the projected data
X_std_pca.shape
Out[249]:
In [250]:
# Wrap the PCA-projected array in a DataFrame (one column per component,
# default integer column labels)
X1 = pd.DataFrame(X_std_pca)
In [251]:
# Heatmap of the correlation matrix of the PCA components
component_corr = X1.corr()
sns.heatmap(component_corr)
plt.show()
In [239]:
# Rank the predictors with recursive feature elimination (RFE).
# Eliminating down to a single feature yields a full 1..n ranking. Asking RFE to
# keep all len(X.columns) features eliminates nothing, so every feature would get
# rank 1 and the table below would carry no information.
nfeatures = 1
# n_features_to_select is passed by keyword: it is keyword-only in current scikit-learn
rfe = RFE(lr, n_features_to_select=nfeatures)
fit = rfe.fit(X, y)
# Summarize the selection: rank 1 is the last feature left standing
result_RFE = pd.DataFrame({
    'Features': list(X.columns),
    'Ranking': rfe.ranking_,
    'Support': rfe.support_,
})
result_RFE.sort_values('Ranking')
Out[239]:
In [240]:
# Feature selection using Lasso: draw the fitted coefficients of every predictor
# for each alpha in 0.01, 0.02, ..., 0.99 on one shared figure
coef_positions = range(len(names))
# Axis decoration does not depend on alpha, so it is set once up front
plt.xticks(coef_positions, names, rotation=90)
plt.ylabel('Coefficients')
for lambd in [step * 0.01 for step in range(1, 100)]:
    lasso = Lasso(alpha=lambd)
    lasso_coef = lasso.fit(X, y).coef_
    plt.plot(coef_positions, lasso_coef)
plt.show()
Lasso feature selection shows that 'logpopulation', 'Unit Larceny & Theft' and 'logviolentcrimes' are the most significant features.
In [241]:
# Collect [mean cross-validation score, parameter value] pairs.
# The first row is the score of the current `lr`, stored with a parameter value of 0.
# The columns keep the names 'R2' and 'lambda' used by the plotting cell; the value
# stored under 'lambda' is the C passed to LogisticRegression.
score_rows = [[cross_val_score(lr, X, y, cv=kf).mean(), 0]]
# Run the model for C = 1..9
for lambd in range(1, 10):
    lr = LogisticRegression(C=lambd)
    lr.fit(X, y)
    score_rows.append([cross_val_score(lr, X, y, cv=kf).mean(), lambd])
scores = np.array(score_rows, dtype=float)
# Make the data pretty
scores_df = pd.DataFrame(scores, columns=['R2', 'lambda'])
scores_df = scores_df.sort_values(by='lambda', ascending=True)
scores_df
Out[241]:
In [242]:
# Plot the mean cross-validation score against C.
# For a classifier, cross_val_score without a `scoring` argument reports accuracy,
# and the swept parameter is LogisticRegression's C (the inverse of the
# regularization strength), so the axes are labelled accordingly rather than
# as R2 / lambda. The column names of scores_df are unchanged.
plt.plot(scores_df['lambda'], scores_df['R2'])
plt.title('Logistic Regression')
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('Mean CV accuracy')
plt.show()
Out[242]:
In [243]:
# Ridge regression model with alpha = 9
ridgeregr = Ridge(alpha=9)
# Fit on the full data set and report the learned parameters
fit = ridgeregr.fit(X, y)
print(fit.coef_)
print(fit.intercept_)
# Mean score over the cross-validation folds
ridge_fold_scores = cross_val_score(ridgeregr, X, y, cv=kf)
ridge_fold_scores.mean()
Out[243]:
In [244]:
# Ridge regression sweep
# Collect [mean cross-validation score, alpha] pairs; the first row is the score
# of the current `ridgeregr`, stored with an alpha value of 0
score_rows = [[cross_val_score(ridgeregr, X, y, cv=kf).mean(), 0]]
# Run the model for alpha = 1..99
for lambd in range(1, 100):
    ridgeregr = Ridge(alpha=lambd)
    ridgeregr.fit(X, y)
    score_rows.append([cross_val_score(ridgeregr, X, y, cv=kf).mean(), lambd])
scores = np.array(score_rows, dtype=float)
# Make the data pretty
scores_df = pd.DataFrame(scores, columns=['R2', 'lambd'])
scores_df = scores_df.sort_values(by='lambd', ascending=True)
In [245]:
# Plot how the mean cross-validation score changes with lambda
fig, ax = plt.subplots()
ax.plot(scores_df['lambd'], scores_df['R2'])
ax.set_title('Ridge Regression')
ax.set_xlabel('lambd')
ax.set_ylabel('R2')
Out[245]:
In [246]:
# Lasso regression model with alpha = 0.01
lass = linear_model.Lasso(alpha=0.01)
# Fit on the full data set and report the learned parameters
fit = lass.fit(X, y)
print(fit.coef_)
print(fit.intercept_)
# Mean score over the cross-validation folds
lasso_fold_scores = cross_val_score(lass, X, y, cv=kf)
lasso_fold_scores.mean()
Out[246]:
In [247]:
# Lasso regression sweep
# Collect [mean cross-validation score, alpha] pairs; the first row is the score
# of the current `lass`, stored with an alpha value of 0
score_rows = [[cross_val_score(lass, X, y, cv=kf).mean(), 0]]
# Run the model for alpha = 0.01, 0.02, ..., 0.09
for lambd in [step * 0.01 for step in range(1, 10)]:
    lass = linear_model.Lasso(alpha=lambd)
    lass.fit(X, y)
    score_rows.append([cross_val_score(lass, X, y, cv=kf).mean(), lambd])
scores = np.array(score_rows, dtype=float)
# Make the data pretty
scores_df = pd.DataFrame(scores, columns=['R2', 'lambd'])
scores_df = scores_df.sort_values(by='lambd', ascending=True)
In [248]:
# Plot how the mean cross-validation score changes with lambda
fig, ax = plt.subplots()
ax.plot(scores_df['lambd'], scores_df['R2'])
ax.set_title('Lasso Regression')
ax.set_xlabel('lambd')
ax.set_ylabel('R2')
Out[248]:
Be clear about the decisions you made that led to these models (feature selection, regularization parameter selection, model evaluation criteria) and why you think that particular model is the best of the three.
Of the three models, I would choose based on the cross-validation score. In this case, the best model would be the Ridge Regression with an alpha of 9, giving an R2 of approximately 80%. The Logistic Regression gives a value of 89% with a C of 6, and the Lasso Regression, to achieve its maximum R2, requires an alpha so small that the model is close to OLS.
The selected features are ['Robbery','Rape','Burglary']. They were chosen for the additional information they contribute to R2.
The models have been evaluated using the R2 criterion.
Also reflect on the strengths and limitations of regression as a modeling approach.
Were there things you couldn't do but you wish you could have done?
In [ ]: